
#########################################################
# Convenient functions for  read data and pre-process
# 18.01.2021
# Ahmed Hemedan
#########################################################
readData <- function(inFile ) {
  library(DESeq2)
  x <- read.csv(inFile)
  if(dim(x)[2] <= 2 )
    x <- read.table(inFile, sep="\t",header=TRUE)	
  #-------Remove non-numeric columns, except the first column
  for(i in 2:dim(x)[2])
    dataType = c( dataType, is.numeric(x[,i]) )
  if(sum(dataType) <=2) return (NULL) 
  x <- x[,dataType]  # only keep numeric columns
  # rows with all missing values
  ix = which( apply(x[,-1],1, function(y) sum( is.na(y) ) ) != dim(x)[2]-1 )
  x <- x[ix,]
  dataSizeOriginal = dim(x); dataSizeOriginal[2] = dataSizeOriginal[2] -1
  x[,1] <- toupper(x[,1])
  x[,1] <- gsub(" ","",x[,1]) # remove spaces in gene ids
  x = x[order(- apply(x[,2:dim(x)[2]],1,sd) ),]  # sort by SD
  x <- x[!duplicated(x[,1]) ,]  # remove duplicated IDs
  rownames(x) <- x[,1]
  x <- as.matrix(x[,c(-1)])
  # remove "-" or "." from sample names
  colnames(x) = gsub("-","",colnames(x))
  colnames(x) = gsub("\\.","",colnames(x))
  #cat("\nhere",dim(x))
  # missng value for median value
  if(sum(is.na(x))>0) {# if there is missing values
    if(input_missingValue =="geneMedian") { 
      rowMedians <- apply(x,1, function (y)  median(y,na.rm=T))
      for( i in 1:dim(x)[2] ) {
        ix = which(is.na(x[,i]) )
        x[ix,i] <- rowMedians[ix]						
      }
      
    } else if(input_missingValue =="treatAsZero") {
      x[is.na(x) ] <- 0					
    } else if (input_missingValue =="geneMedianInGroup") {
      sampleGroups = detectGroups( colnames(x))
      for (group in unique( sampleGroups) ){		
        samples = which( sampleGroups == group )
        rowMedians <- apply(x[,samples, drop=F],1, function (y)  median(y,na.rm=T))
        for( i in  samples ) { 
          ix = which(is.na(x[ ,i] ) )	
          if(length(ix) >0 )
            x[ix, i  ]  <- rowMedians[ix]
        }										
      }
      
      # missing for entire sample group, use median for all samples
      if(sum(is.na(x) )>0 ) { 
        rowMedians <- apply(x,1, function (y)  median(y,na.rm=T))
        for( i in 1:dim(x)[2] ) {
          ix = which(is.na(x[,i]) )
          x[ix,i] <- rowMedians[ix]						
        }						
      }
    }
  }
  
  # Compute kurtosis
  mean.kurtosis = mean(apply(x,2, kurtosis),na.rm=T)
  rawCounts = NULL
  pvals= NULL
  if (input_dataFileFormat == 2 ) { 
    
    if ( is.integer(x) ) dataTypeWarning = 1;
    
    x <- x[ which( apply( x, 1,  function(y) sum(y >= input_lowFilter)) >= input_NminSamples2 ) , ] 
    x <- x[which(apply(x,1, function(y) max(y)- min(y) ) > 0  ),]  # remove rows with all the same levels
    
    #Log transform
    # Takes log if log is selected OR kurtosis is big than 100
    if ( (input_transform == TRUE) | (mean.kurtosis > kurtosis.log ) ) 
      x = log(x+abs( input_logStart),2)
    
    tem <- apply(x,1,sd) 
    x <- x[order(-tem),]  # sort by SD
    
  } else 
    if( input_dataFileFormat == 1) {  # counts data
      
      # data not seems to be read counts
      if(!is.integer(x) & mean.kurtosis < kurtosis.log ) {
        dataTypeWarning = -1
      }
      # not used as some counts data like those from CRISPR screen
      #validate(   # if Kurtosis is less than a threshold, it is not read-count
      #	need(mean.kurtosis > kurtosis.log, "Data does not seem to be read count based on distribution. Please double check.")
      # )
      x <- round(x,0) # enforce the read counts to be integers. Sailfish outputs has decimal points.
      # remove IDs if it does not at least have minCounts in at least NminSamples
      x <- x[ which( apply( cpm(DGEList(counts = x)), 1,  
                            function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ] 
      
      rawCounts = x;
      # construct DESeqExpression Object
      colData = cbind(colnames(x), as.character(detectGroups( colnames(x) )) )
      tem = rep("A",dim(x)[2]); tem[1] <- "B"
      colData = cbind(colnames(x), tem )
      colnames(colData)  = c("sample", "groups")
      dds <- DESeqDataSetFromMatrix(countData = x, colData = colData, design = ~ groups)
      dds <- estimateSizeFactors(dds) # estimate size factor for use in normalization later for started log method
      
      
      # regularized log  or VST transformation
      if( input_CountsTransform == 3 ) { # rlog is slow, only do it with 10 samples
        if(dim(x)[2]<=20 ) { 
          x <- rlog(dds, blind=TRUE); x <- assay(x) } else 
            x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart ) 
      }  
      
      else {
        if ( input_CountsTransform == 2 ) {
          x <- vst(dds, blind=TRUE)
          x <- assay(x)  
        } else{  # normalized by library sizes and add a constant.
          x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart )   # log(x+c) 
          # This is equivalent to below. But the prior counts is more important
          #x <- cpm(DGEList(counts = x),log=TRUE, prior.count=input_countsLogStart )  #log CPM from edgeR
          #x <- x-min(x)  # shift values to avoid negative numbers
        }
      }
    } else 
      if( input_dataFileFormat == 3)	{ 
        n2 = ( dim(x)[2] %/% 2) 
        # It looks like it contains P values
        # ranges of columns add 0.2 and round to whole. For P value columns this should be 1
        tem = round( apply(x, 2, function( y) max(y)- min(y))  + .2)     
        if( sum(tem[(1:n2)*2  ] ==  1 ) == n2 | 
            sum(tem[(1:n2)*2-1  ] ==  1 ) == n2 ) { 		
          x = x[,1:(2*n2) ,drop=FALSE ] # if 5, change it to 4			
          if(tem[2] == 1) { # FDR follows Fold-change
            pvals = x [,2*(1:n2 ),drop=FALSE ]
            x = x[, 2*(1:n2 )-1,drop=FALSE] 
            
          } else {	# FDR follows Fold-change
            pvals = x [,2*(1:n2 )-1,drop=FALSE ] 
            x = x[, 2*(1: n2 ),drop=FALSE] 
          }
        }
        ix =  which(apply(x,1, function(y) max(y)- min(y) ) > 0  )
        x <- x[ix,]  # remove rows with all the same levels
        if(!is.null(pvals) )
          pvals = pvals[ix,]
        
      }
  
  sampleInfoDemo=NULL
  if( input_goButton >0)
    sampleInfoDemo <- t( read.csv(demoDataFile2,row.names=1,header=T,colClasses="character") )
  
  finalResult <- list(data = as.matrix(x), mean.kurtosis = mean.kurtosis, rawCounts = rawCounts, dataTypeWarning=dataTypeWarning, dataSize=c(dataSizeOriginal,dataSize),sampleInfoDemo=sampleInfoDemo, pvals =pvals )
  return(finalResult)
  
}
#Convenient funcion for read data annotation
readSampleInfo <- function(inFile){
  x <- read.csv(inFile,row.names=1,header=T,colClasses="character")	
  if(dim(x)[2] <= 2 )
    x <- read.table(inFile, row.names=1,sep="\t",header=TRUE,colClasses="character")
  colnames(x) = gsub("-","",colnames(x))
  colnames(x) = gsub("\\.","",colnames(x))	
  
  #Matching with column names of expression file
  ix = match(toupper(colnames(readData.out$data)), toupper(colnames(x)) ) 
  ix = ix[which(!is.na(ix))] # remove NA
  
  #Double check factor levels
  # remove "-" or "." from factor levels
  for( i in 1:dim(x)[1]) {
    x[i,] = gsub("-","",x[i,])
    x[i,] = gsub("\\.","",x[i,])				
  }
  # if levels from different factors match
  if( length(unique(ix) ) == dim(readData.out$data)[2]) { # matches exactly
    x = x[,ix]
    # if the levels of different factors are the same, it may cause problems
    if( sum( apply(x, 1, function(y) length(unique(y)))) > length(unique(unlist(x) ) ) ) {
      tem2 =apply(x,2, function(y) paste0( names(y),y)) # factor names are added to levels
      rownames(tem2) = rownames(x)
      x <- tem2				
    }
    return(t( x ) )			
  } else retrun(NULL)
  
  
}


# Clean up data, remove spaces and other control characters from IDs  
cleanSet <- function (x){
  # remove duplicate; upper case; remove special characters
  x <- unique( toupper( gsub("\n| ","",x) ) )
  x <- x[which( nchar(x)>1) ]  # IDs should have at least two characters
  return(x)
}
findSpeciesById <- function (speciesID){ # find species name use id
  return( orgInfo[which(orgInfo$id == speciesID),]  )
}

# just return name
findSpeciesByIdName <- function (speciesID){ # find species name use id
  return( orgInfo[which(orgInfo$id == speciesID),3]  )
}


convertedData <- function() {
  if( is.null(converted.out ) ) return( readData.out$data) # if id or species is not recognized use original data.
  
  if(input_noIDConversion) return( readData.out$data )
  
  mapping <- converted.out$conversionTable
  # cat (paste( "\nData:",input_selectOrg) )
  x =readData.out$data
  
  rownames(x) = toupper(rownames(x))
  # any gene not recognized by the database is disregarded
  # x1 = merge(mapping[,1:2],x,  by.y = 'row.names', by.x = 'User_input')
  x1 = merge(mapping[,1:2],x,  by.y = 'row.names', by.x = 'User_input', all.y=TRUE)
  
  # original IDs used if ID is not matched in database
  ix = which(is.na(x1[,2]) )
  x1[ix,2] = x1[ix,1] 
  
  #multiple matched IDs, use the one with highest SD
  tem = apply(x1[,3:(dim(x1)[2])],1,sd)
  x1 = x1[order(x1[,2],-tem),]
  x1 = x1[!duplicated(x1[,2]) ,]
  rownames(x1) = x1[,2]
  x1 = as.matrix(x1[,c(-1,-2)])
  tem = apply(x1,1,sd)
  x1 = x1[order(-tem),]  # sort again by SD
  return(x1)
  
}

wrapper_tests <- function(first_cohort,second_cohort,param) {
  effect_size <- cohen.d(first_cohort,second_cohort,param)
  write.csv(result,paste0(param,".csv"))
  wilcox <- wilcox.test(first_cohort,second_cohort,exact = FALSE)
  write.csv(wilcoz,paste0(param,".csv"))
  t_test <-t.test(first_cohort,second_cohort, "two.sided")
  write.csv(t_test,paste0(param,".csv"))
}

# Define sample groups based on column names
detectGroups <- function (x){  # x are col names
  tem <- gsub("[0-9]*$","",x) # Remove all numbers from end
  tem <- gsub("_$","",tem); 
  tem <- gsub("_Rep$","",tem); 
  tem <- gsub("_rep$","",tem); 
  tem <- gsub("_REP$","",tem) 
  return( tem )
}